In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [1478]:
# Dropdown to pick which stock's CSV to load in the next cell.
w = widgets.Dropdown(
    options=['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
             'FB', 'GME', 'MCD', 'PFE', 'PLUG',
             'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Print the newly selected ticker whenever the dropdown value changes."""
    # With names='value' below we only receive value-change events, but the
    # guard is kept so the handler stays safe if reused without the filter.
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

# Subscribe only to changes of the `value` trait — the original bare
# observe() fired for every trait change (layout, description, ...).
w.observe(on_change, names='value')

display(w)
You have selected GME
In [1485]:
# Every per-ticker file follows the pattern /content/Final_<TICKER>.csv,
# so build the path from the dropdown selection instead of 15 copy-pasted
# if-statements. Also fail loudly if no ticker was chosen — previously `df`
# was silently left undefined, causing a confusing NameError further down.
if w.value == 'SELECT':
    raise ValueError("Select a stock ticker from the dropdown before running this cell.")
df = pd.read_csv(f'/content/Final_{w.value}.csv')
In [1486]:
# Show full, untruncated column contents when displaying DataFrames.
pd.set_option('display.max_colwidth', None)
In [1487]:
# Parse the Date strings into datetime64[ns] so time-series plots/ops work.
df['Date'] = df['Date'].astype("datetime64[ns]")
In [1488]:
# Drop the leftover index column — 'Unnamed: 0' is presumably the artifact of
# the CSV having been written with to_csv(index=True); alternatively pass
# index_col=0 to read_csv when loading.
del df['Unnamed: 0']
In [1489]:
df.head(5)
Out[1489]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2020-11-20 12.55 13.46 12.54 12.71 12.71 8395400 2.006421 -0.148870 0.355139 1.029680 12.987585 10.603844 11.795714 NaN 8.101337 1.00 65.494886 NaN NaN NaN 0.85 NaN 0.071670 49.901721 NaN NaN 66.833373 59.452616 -4.313529e+07 -1.622475e+06 16067800.0 0.0 4.039610e+06 0.0 0.0 0.0 0.0 0.0 4.039610e+06 0.0 4.039610e+06 0.0 4.039610e+06 3 214 217 0 217 0 217 217
1 2020-11-23 12.90 14.12 12.67 13.90 13.90 9599100 9.362703 -0.230230 0.767812 1.089726 13.943926 10.438931 12.191429 -0.391218 7.839753 1.45 72.320706 NaN NaN NaN 2.41 -3.185713 0.209748 57.176797 NaN NaN 83.024414 69.372552 -3.644902e+07 3.479938e+05 25666900.0 0.0 2.805894e+06 0.0 0.0 0.0 0.0 0.0 2.805894e+06 0.0 2.805894e+06 0.0 2.805894e+06 2 200 202 0 202 0 202 202
2 2020-11-24 14.23 14.26 13.30 13.67 13.67 7183200 -1.654673 -0.387980 0.736326 1.071193 14.287618 10.855239 12.571429 -0.231154 7.836089 0.96 73.612524 NaN NaN NaN 2.57 -1.883716 0.231531 55.499280 NaN NaN 81.339609 77.065799 -3.809517e+07 5.986833e+05 18483700.0 0.0 2.458240e+06 0.0 0.0 0.0 0.0 0.0 2.458240e+06 0.0 2.458240e+06 0.0 2.458240e+06 4 129 133 0 133 0 133 133
3 2020-11-25 13.60 15.25 13.42 14.75 14.75 8860100 7.900511 -0.241312 1.229310 1.179594 15.173199 10.738229 12.955714 0.006282 7.997249 1.83 80.948138 NaN NaN NaN 3.00 0.051052 0.255319 61.248593 NaN NaN 86.077732 83.480585 -3.407666e+07 1.925431e+06 27343800.0 0.0 2.914776e+06 0.0 0.0 0.0 0.0 0.0 2.914776e+06 0.0 2.914776e+06 0.0 2.914776e+06 3 229 232 0 232 0 232 232
4 2020-11-27 15.29 16.74 14.88 16.08 16.08 12504200 9.016949 0.376598 1.968555 1.295367 16.397533 10.785324 13.591428 0.345128 8.055762 1.99 87.197458 NaN NaN NaN 4.95 2.787443 0.444744 66.917060 NaN 55.821979 83.377127 83.598156 -3.044641e+07 3.448232e+06 39848000.0 0.0 6.893015e+06 0.0 0.0 0.0 0.0 0.0 6.893015e+06 0.0 6.893015e+06 0.0 6.893015e+06 6 295 301 0 301 0 301 301
In [1490]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       271 non-null    datetime64[ns]
 1   Open                       271 non-null    float64       
 2   High                       271 non-null    float64       
 3   Low                        271 non-null    float64       
 4   Close                      271 non-null    float64       
 5   Adj Close                  271 non-null    float64       
 6   Volume                     271 non-null    int64         
 7   Return                     271 non-null    float64       
 8   Beta                       271 non-null    float64       
 9   Variance                   271 non-null    float64       
 10  AvgTrueRange               271 non-null    float64       
 11  Upperband                  271 non-null    float64       
 12  Lowerband                  271 non-null    float64       
 13  Middleband                 271 non-null    float64       
 14  APO                        270 non-null    float64       
 15  NATR                       271 non-null    float64       
 16  TRANGE                     271 non-null    float64       
 17  DMI                        271 non-null    float64       
 18  MACD                       262 non-null    float64       
 19  MACDSIGNAL                 262 non-null    float64       
 20  MACDHIST                   262 non-null    float64       
 21  MOM                        271 non-null    float64       
 22  PPO                        270 non-null    float64       
 23  ROCP                       271 non-null    float64       
 24  RSI                        271 non-null    float64       
 25  TRIX                       207 non-null    float64       
 26  ULTOSC                     267 non-null    float64       
 27  SLOWK                      271 non-null    float64       
 28  SLOWD                      271 non-null    float64       
 29  AD                         271 non-null    float64       
 30  ADOSC                      271 non-null    float64       
 31  OBV                        271 non-null    float64       
 32  Upward_momentum_created    271 non-null    float64       
 33  Downward_momentum_created  271 non-null    float64       
 34  B5_O_Um                    271 non-null    float64       
 35  B5_C_Um                    271 non-null    float64       
 36  B5_E_Um                    271 non-null    float64       
 37  B5_A_Um                    271 non-null    float64       
 38  B5_N_Um                    271 non-null    float64       
 39  B5_O_Dm                    271 non-null    float64       
 40  B5_C_Dm                    271 non-null    float64       
 41  B5_E_Dm                    271 non-null    float64       
 42  B5_A_Dm                    271 non-null    float64       
 43  B5_N_Dm                    271 non-null    float64       
 44  Verified_status_True       271 non-null    int64         
 45  Verified_status_False      271 non-null    int64         
 46  O                          271 non-null    int64         
 47  C                          271 non-null    int64         
 48  E                          271 non-null    int64         
 49  A                          271 non-null    int64         
 50  N                          271 non-null    int64         
 51  Real_or_Fake_tweet         271 non-null    int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 110.2 KB
In [1491]:
df.shape
Out[1491]:
(271, 52)
In [1492]:
# NOTE(review): this font scale is immediately overridden by the
# sns.set_context() call in the next cell, so it has no lasting effect.
sns.set(font_scale=0.8)
In [1493]:
# Bump font sizes for readability (switch context to "poster" for larger text).
sns.set_context("talk", font_scale=1.3)

# Plot the selected stock's daily closing price over the sample period.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18, 8))
    sns.lineplot(x=df['Date'], y=df['Close'], color='blue', ax=ax)
    ax.set_title('Closing Price')
In [1494]:
# Daily percentage returns via pct_change().
# The original chained .dropna() here was a no-op: assigning back into the
# DataFrame re-aligns on the index, so the first row is NaN either way.
# (NaN rows are handled explicitly in a later cell.)
df['returns'] = 100 * df.Close.pct_change()
In [1495]:
# Log returns: ln(P_t / P_{t-1}); the first row is NaN (no prior price).
price_ratio = df['Close'] / df['Close'].shift(1)
df['log_returns'] = np.log(price_ratio)
In [1496]:
df.head()
Out[1496]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2020-11-20 12.55 13.46 12.54 12.71 12.71 8395400 2.006421 -0.148870 0.355139 1.029680 12.987585 10.603844 11.795714 NaN 8.101337 1.00 65.494886 NaN NaN NaN 0.85 NaN 0.071670 49.901721 NaN NaN 66.833373 59.452616 -4.313529e+07 -1.622475e+06 16067800.0 0.0 4.039610e+06 0.0 0.0 0.0 0.0 0.0 4.039610e+06 0.0 4.039610e+06 0.0 4.039610e+06 3 214 217 0 217 0 217 217 NaN NaN
1 2020-11-23 12.90 14.12 12.67 13.90 13.90 9599100 9.362703 -0.230230 0.767812 1.089726 13.943926 10.438931 12.191429 -0.391218 7.839753 1.45 72.320706 NaN NaN NaN 2.41 -3.185713 0.209748 57.176797 NaN NaN 83.024414 69.372552 -3.644902e+07 3.479938e+05 25666900.0 0.0 2.805894e+06 0.0 0.0 0.0 0.0 0.0 2.805894e+06 0.0 2.805894e+06 0.0 2.805894e+06 2 200 202 0 202 0 202 202 9.362703 0.089500
2 2020-11-24 14.23 14.26 13.30 13.67 13.67 7183200 -1.654673 -0.387980 0.736326 1.071193 14.287618 10.855239 12.571429 -0.231154 7.836089 0.96 73.612524 NaN NaN NaN 2.57 -1.883716 0.231531 55.499280 NaN NaN 81.339609 77.065799 -3.809517e+07 5.986833e+05 18483700.0 0.0 2.458240e+06 0.0 0.0 0.0 0.0 0.0 2.458240e+06 0.0 2.458240e+06 0.0 2.458240e+06 4 129 133 0 133 0 133 133 -1.654673 -0.016685
3 2020-11-25 13.60 15.25 13.42 14.75 14.75 8860100 7.900511 -0.241312 1.229310 1.179594 15.173199 10.738229 12.955714 0.006282 7.997249 1.83 80.948138 NaN NaN NaN 3.00 0.051052 0.255319 61.248593 NaN NaN 86.077732 83.480585 -3.407666e+07 1.925431e+06 27343800.0 0.0 2.914776e+06 0.0 0.0 0.0 0.0 0.0 2.914776e+06 0.0 2.914776e+06 0.0 2.914776e+06 3 229 232 0 232 0 232 232 7.900511 0.076039
4 2020-11-27 15.29 16.74 14.88 16.08 16.08 12504200 9.016949 0.376598 1.968555 1.295367 16.397533 10.785324 13.591428 0.345128 8.055762 1.99 87.197458 NaN NaN NaN 4.95 2.787443 0.444744 66.917060 NaN 55.821979 83.377127 83.598156 -3.044641e+07 3.448232e+06 39848000.0 0.0 6.893015e+06 0.0 0.0 0.0 0.0 0.0 6.893015e+06 0.0 6.893015e+06 0.0 6.893015e+06 6 295 301 0 301 0 301 301 9.016949 0.086333
In [1497]:
# NOTE(review): despite the original "drop the 1st row" intent, dropna() with
# no arguments removes EVERY row containing any NaN in any column — indicator
# columns like TRIX are NaN for the first ~64 rows, which is why the frame
# shrinks from 271 to 207 entries (see df.info() below). If only the rows
# created by pct_change()/shift() should go, use
# df.dropna(subset=['returns', 'log_returns']) instead.
df.dropna(inplace=True)
In [1498]:
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    axes[0][0].plot(df.returns, color='blue')
    axes[0][0].set_title('Returns')

    sns.distplot(df.returns, norm_hist=True, fit=stats.norm, color='blue',
                bins=50, ax=axes[0][1])
    axes[0][1].set_title('Returns')

    axes[1][0].plot(df.log_returns, color='green')
    axes[1][0].set_title('Log Returns')

    sns.distplot(df.log_returns, norm_hist=True, fit=stats.norm, color='green',
                bins=50, ax=axes[1][1])
    axes[1][1].set_title('Log Returns')
    plt.tight_layout()
    fig.show();
In [1499]:
# HELPER: REALIZED VOLATILITY FROM DAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Daily realized volatility: the square root of the sum of squared log
    returns within a window, normalised by (n - 1).

    Parameters
    ----------
    series_log_return : pd.Series or array-like
        Log returns within the window interval.

    Returns
    -------
    float
        Realized volatility, or NaN when the window holds fewer than two
        observations (the original divided by zero for n == 1).
    """
    n = len(series_log_return)
    if n < 2:
        return np.nan
    return np.sqrt(np.sum(series_log_return**2)/(n - 1))
In [1500]:
# Realized volatility computed over several rolling windows (trading days).
intervals = [7, 30, 60, 180, 365]

# One column of rolling realized volatility per window size.
vols_df = {
    window: df.log_returns.rolling(window=window)
                          .apply(realized_volatility_daily)
                          .values
    for window in intervals
}

# Assemble all window columns into a single DataFrame aligned to df's index.
vols_df = pd.DataFrame(vols_df, columns=intervals, index=df.index)
In [1501]:
# Use the fivethirtyeight matplotlib style for the remaining figures.
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18, 7))

for window in intervals:
    # The 7-day series is noisy, so draw it thinner and semi-transparent
    # to keep the longer windows readable.
    if window == 7:
        line_alpha, line_width = 0.5, 1
    else:
        line_alpha, line_width = 1.0, 2
    ax.plot(vols_df[window],
            label=f'{window}-Day Interval Realized Volatility',
            alpha=line_alpha, lw=line_width)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [1502]:
# Window length (rows, i.e. trading days) for both volatility measures,
# and the forecast horizon in rows.
INTERVAL_WINDOW = 30
n_future = 7

# BACKWARD-LOOKING REALIZED VOLATILITY: 30-row window ending at row t
# (the model's observable input at time t).
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# FORWARD-LOOKING REALIZED VOLATILITY (prediction target): shifting log
# returns by -n_future means the 30-row window at row t covers returns from
# rows t-22 through t+7 — i.e. it peeks n_future rows into the future.
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [1503]:
df.describe()
Out[1503]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 207.000000 207.000000 207.000000 207.000000 207.000000 2.070000e+02 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 2.070000e+02 2.070000e+02 2.070000e+02 207.0 2.070000e+02 207.0 207.0 207.0 207.0 207.0 2.070000e+02 207.0 2.070000e+02 207.0 2.070000e+02 207.000000 207.000000 207.000000 207.0 207.000000 207.0 207.000000 207.000000 207.000000 207.000000 178.000000 171.000000
mean 187.597488 197.537440 178.427343 187.103043 187.103043 8.462967e+06 0.575055 0.551097 258.195866 20.138345 210.682573 160.875080 185.778826 3.050232 10.890637 19.850435 36.405828 3.797127 3.953580 -0.156453 5.085169 2.309358 0.129043 50.748245 0.799605 45.758042 44.359167 44.381942 -2.475499e+08 -2.059015e+06 1.253085e+09 0.0 1.686618e+07 0.0 0.0 0.0 0.0 0.0 1.686618e+07 0.0 1.686618e+07 0.0 1.686618e+07 23.299517 2395.420290 2418.719807 0.0 2418.719807 0.0 2418.719807 2418.719807 0.575055 0.002555 0.064757 0.061346
std 32.779206 36.217505 30.649501 32.481793 32.481793 1.553779e+07 8.261050 0.473493 483.743900 11.548804 40.259944 35.428416 31.991064 22.857675 6.433320 18.997533 25.742918 12.460902 10.836341 5.253349 47.887780 15.972755 0.558327 10.627835 1.189472 6.853607 21.648726 19.947213 1.481241e+07 7.595282e+06 3.839607e+07 0.0 2.108665e+07 0.0 0.0 0.0 0.0 0.0 2.108665e+07 0.0 2.108665e+07 0.0 2.108665e+07 34.934851 1887.586691 1920.386738 0.0 1920.386738 0.0 1920.386738 1920.386738 8.261050 0.079272 0.032311 0.028093
min 104.540001 127.750000 86.000000 101.739998 101.739998 8.170000e+05 -33.788173 -1.035069 2.461732 8.207407 111.898036 7.710536 59.804286 -39.899167 4.447977 4.430008 1.342826 -14.650800 -11.504427 -10.324166 -144.660004 -41.563797 -0.545887 29.694287 -0.347303 29.841911 5.285988 6.560340 -2.737380e+08 -4.674117e+07 1.070773e+09 0.0 2.631992e+06 0.0 0.0 0.0 0.0 0.0 2.631992e+06 0.0 2.631992e+06 0.0 2.631992e+06 2.000000 960.000000 962.000000 0.0 962.000000 0.0 962.000000 962.000000 -33.788173 -0.412311 0.026954 0.026954
25% 163.720001 173.254997 158.934998 164.875000 164.875000 2.078850e+06 -2.878621 0.239682 33.784304 12.839828 186.694517 143.855679 167.255713 -11.963141 7.107317 9.040001 13.808580 -3.916909 -2.369103 -4.078278 -21.625000 -6.632109 -0.108796 42.300854 -0.006155 41.366079 25.856948 26.892865 -2.572258e+08 -2.213235e+06 1.248134e+09 0.0 7.110452e+06 0.0 0.0 0.0 0.0 0.0 7.110452e+06 0.0 7.110452e+06 0.0 7.110452e+06 7.000000 1468.500000 1480.000000 0.0 1480.000000 0.0 1480.000000 1480.000000 -2.878621 -0.029209 0.037009 0.036915
50% 183.000000 190.199997 176.149994 183.509995 183.509995 3.637500e+06 -0.169761 0.583654 68.545835 16.188048 203.000656 166.500148 181.825716 -1.258463 8.454730 13.910004 32.304798 0.930359 1.512840 -0.385839 -4.850006 -0.757182 -0.025438 49.722342 0.213994 44.569060 44.653556 44.299547 -2.482722e+08 -1.172837e+06 1.256641e+09 0.0 9.857086e+06 0.0 0.0 0.0 0.0 0.0 9.857086e+06 0.0 9.857086e+06 0.0 9.857086e+06 12.000000 1819.000000 1833.000000 0.0 1833.000000 0.0 1833.000000 1833.000000 -0.169761 -0.001699 0.055467 0.055141
75% 206.474998 215.775002 198.919998 205.794998 205.794998 7.928400e+06 2.491689 0.849307 239.841759 23.749896 225.371296 184.969773 207.012858 12.635994 11.874248 21.790001 53.720243 8.474013 7.360273 1.885374 19.845001 6.563387 0.119799 57.409790 0.936071 50.686769 62.570839 61.931554 -2.389040e+08 -1.476984e+05 1.273236e+09 0.0 1.793602e+07 0.0 0.0 0.0 0.0 0.0 1.793602e+07 0.0 1.793602e+07 0.0 1.793602e+07 25.500000 2678.000000 2700.500000 0.0 2700.500000 0.0 2700.500000 2700.500000 2.491689 0.024611 0.080072 0.077179
max 303.119995 348.500000 291.510010 302.559998 302.559998 1.503088e+08 52.692376 1.622262 3433.626003 59.525724 325.171336 232.070237 274.338571 74.173526 38.378684 176.500000 90.855730 46.195273 35.685519 19.532892 201.929993 59.598259 4.490327 76.273269 4.266893 66.032776 89.234937 82.492823 -1.877761e+08 1.522601e+07 1.343634e+09 0.0 2.247220e+08 0.0 0.0 0.0 0.0 0.0 2.247220e+08 0.0 2.247220e+08 0.0 2.247220e+08 300.000000 20149.000000 20449.000000 0.0 20449.000000 0.0 20449.000000 20449.000000 52.692376 0.423255 0.150254 0.148385
In [1504]:
# Give the label column a shorter, clearer name for downstream use.
df.rename(columns = {'Real_or_Fake_tweet': 'Fake_news'}, inplace = True)
In [1505]:
# NOTE(review): median imputation uses statistics from the whole sample
# (including future rows) — consider ffill if look-ahead leakage matters
# for the volatility forecasting task.
df = df.fillna(df.median())
In [1506]:
df.isna().sum()
Out[1506]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1507]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 207 entries, 64 to 270
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       207 non-null    datetime64[ns]
 1   Open                       207 non-null    float64       
 2   High                       207 non-null    float64       
 3   Low                        207 non-null    float64       
 4   Close                      207 non-null    float64       
 5   Adj Close                  207 non-null    float64       
 6   Volume                     207 non-null    int64         
 7   Return                     207 non-null    float64       
 8   Beta                       207 non-null    float64       
 9   Variance                   207 non-null    float64       
 10  AvgTrueRange               207 non-null    float64       
 11  Upperband                  207 non-null    float64       
 12  Lowerband                  207 non-null    float64       
 13  Middleband                 207 non-null    float64       
 14  APO                        207 non-null    float64       
 15  NATR                       207 non-null    float64       
 16  TRANGE                     207 non-null    float64       
 17  DMI                        207 non-null    float64       
 18  MACD                       207 non-null    float64       
 19  MACDSIGNAL                 207 non-null    float64       
 20  MACDHIST                   207 non-null    float64       
 21  MOM                        207 non-null    float64       
 22  PPO                        207 non-null    float64       
 23  ROCP                       207 non-null    float64       
 24  RSI                        207 non-null    float64       
 25  TRIX                       207 non-null    float64       
 26  ULTOSC                     207 non-null    float64       
 27  SLOWK                      207 non-null    float64       
 28  SLOWD                      207 non-null    float64       
 29  AD                         207 non-null    float64       
 30  ADOSC                      207 non-null    float64       
 31  OBV                        207 non-null    float64       
 32  Upward_momentum_created    207 non-null    float64       
 33  Downward_momentum_created  207 non-null    float64       
 34  B5_O_Um                    207 non-null    float64       
 35  B5_C_Um                    207 non-null    float64       
 36  B5_E_Um                    207 non-null    float64       
 37  B5_A_Um                    207 non-null    float64       
 38  B5_N_Um                    207 non-null    float64       
 39  B5_O_Dm                    207 non-null    float64       
 40  B5_C_Dm                    207 non-null    float64       
 41  B5_E_Dm                    207 non-null    float64       
 42  B5_A_Dm                    207 non-null    float64       
 43  B5_N_Dm                    207 non-null    float64       
 44  Verified_status_True       207 non-null    int64         
 45  Verified_status_False      207 non-null    int64         
 46  O                          207 non-null    int64         
 47  C                          207 non-null    int64         
 48  E                          207 non-null    int64         
 49  A                          207 non-null    int64         
 50  N                          207 non-null    int64         
 51  Fake_news                  207 non-null    int64         
 52  returns                    207 non-null    float64       
 53  log_returns                207 non-null    float64       
 54  vol_current                207 non-null    float64       
 55  vol_future                 207 non-null    float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 92.2 KB
In [1508]:
df.shape
Out[1508]:
(207, 56)
In [1509]:
# NOTE(review): redundant — all NaNs were already imputed with the median a
# few cells above (df.isna().sum() shows zeros), so this drops nothing.
df=df.dropna()
In [1510]:
df.dtypes
Out[1510]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1511]:
# matplotlib.pyplot (plt) and seaborn (sns) are already imported in the
# imports cell at the top of the notebook — the duplicated mid-notebook
# re-imports are removed.
# With 56 columns the annotated heatmap is dense, hence the very wide figure.
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(),annot=True)
Out[1511]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0778d55150>
In [1512]:
# Histogram of every numeric column (70 bins each) to eyeball distributions.
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
In [1513]:
# Features whose absolute correlation with AvgTrueRange exceeds 0.5, strongest first.
df_corr = df.corr()['AvgTrueRange']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with AvgTrueRange:\n{golden_features_list}")
There are 22 strongly correlated values with AvgTrueRange:
AvgTrueRange                 1.000000
NATR                         0.882044
MACD                         0.838387
MACDSIGNAL                   0.836769
TRIX                         0.833292
Variance                     0.756546
TRANGE                       0.680338
APO                          0.654977
PPO                          0.643011
AD                           0.639551
Volume                       0.601623
Upperband                    0.587197
N                            0.549347
O                            0.549347
E                            0.549347
Fake_news                    0.549347
Verified_status_False        0.548890
Verified_status_True         0.540514
Downward_momentum_created    0.534221
B5_O_Dm                      0.534221
B5_E_Dm                      0.534221
B5_N_Dm                      0.534221
Name: AvgTrueRange, dtype: float64
In [1514]:
# Features whose absolute correlation with NATR exceeds 0.5, strongest first.
df_corr = df.corr()['NATR']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with NATR :\n{golden_features_list}")
There are 20 strongly correlated values with NATR :
NATR                         1.000000
TRIX                         0.910356
AvgTrueRange                 0.882044
Volume                       0.731121
MACDSIGNAL                   0.633795
TRANGE                       0.585450
MACD                         0.584282
Variance                     0.571231
B5_N_Dm                      0.569413
B5_E_Dm                      0.569413
B5_O_Dm                      0.569413
Downward_momentum_created    0.569413
Fake_news                    0.559271
N                            0.559271
O                            0.559271
E                            0.559271
Verified_status_False        0.558984
Verified_status_True         0.540588
Lowerband                   -0.538360
ADOSC                       -0.563501
Name: NATR, dtype: float64
In [1515]:
# Features whose absolute correlation with TRANGE exceeds 0.5, strongest first.
df_corr = df.corr()['TRANGE']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with TRANGE:\n{golden_features_list}")
There are 18 strongly correlated values with TRANGE:
TRANGE                       1.000000
Fake_news                    0.834970
E                            0.834970
O                            0.834970
N                            0.834970
Verified_status_False        0.834722
Verified_status_True         0.797366
B5_O_Dm                      0.756951
Downward_momentum_created    0.756951
B5_E_Dm                      0.756951
B5_N_Dm                      0.756951
Volume                       0.742234
Variance                     0.683316
AvgTrueRange                 0.680338
NATR                         0.585450
MACD                         0.560686
High                         0.538455
AD                           0.502956
Name: TRANGE, dtype: float64
In [1516]:
# Features whose absolute correlation with Openness ('O') exceeds 0.5, strongest first.
df_corr = df.corr()['O']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with Openness:\n{golden_features_list}")
There are 16 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999980
B5_N_Dm                      0.956468
B5_E_Dm                      0.956468
B5_O_Dm                      0.956468
Downward_momentum_created    0.956468
Verified_status_True         0.939970
Volume                       0.884955
TRANGE                       0.834970
NATR                         0.559271
Variance                     0.551751
AvgTrueRange                 0.549347
ROCP                         0.500273
Name: O, dtype: float64
In [1517]:
# Features whose absolute correlation with Conscientiousness ('C') exceeds 0.5, strongest first.
df_corr = df.corr()['C']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with conscientiousness:\n{golden_features_list}")
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [1518]:
# Features whose absolute correlation with Extraversion ('E') exceeds 0.5, strongest first.
# BUGFIX: the printed label said "conscientiousness" (copy-paste from the 'C' cell)
# although this cell analyzes 'E'.
df_corr = df.corr()['E']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999980
B5_N_Dm                      0.956468
B5_E_Dm                      0.956468
B5_O_Dm                      0.956468
Downward_momentum_created    0.956468
Verified_status_True         0.939970
Volume                       0.884955
TRANGE                       0.834970
NATR                         0.559271
Variance                     0.551751
AvgTrueRange                 0.549347
ROCP                         0.500273
Name: E, dtype: float64
In [1519]:
# Features whose absolute correlation with Agreeableness ('A') exceeds 0.5, strongest first.
# BUGFIX: the printed label said "conscientiousness" (copy-paste from the 'C' cell)
# although this cell analyzes 'A'.
df_corr = df.corr()['A']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1520]:
# Features whose absolute correlation with Neuroticism ('N') exceeds 0.5, strongest first.
# BUGFIX: the printed label said "conscientiousness" (copy-paste from the 'C' cell)
# although this cell analyzes 'N'.
df_corr = df.corr()['N']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999980
B5_N_Dm                      0.956468
B5_E_Dm                      0.956468
B5_O_Dm                      0.956468
Downward_momentum_created    0.956468
Verified_status_True         0.939970
Volume                       0.884955
TRANGE                       0.834970
NATR                         0.559271
Variance                     0.551751
AvgTrueRange                 0.549347
ROCP                         0.500273
Name: N, dtype: float64
In [1521]:
# Full column list, handy for picking correlation targets in the cells below.
df.columns
Out[1521]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [1522]:
# Features whose absolute correlation with B5_O_Um exceeds 0.5, strongest first.
df_corr = df.corr()['B5_O_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_O_Um:\n{golden_features_list}")
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1523]:
# Features whose absolute correlation with B5_C_Um exceeds 0.5, strongest first.
df_corr = df.corr()['B5_C_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_C_Um:\n{golden_features_list}")
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1524]:
# Features whose absolute correlation with B5_E_Um exceeds 0.5, strongest first.
df_corr = df.corr()['B5_E_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_E_Um:\n{golden_features_list}")
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1525]:
# Features whose absolute correlation with B5_A_Um exceeds 0.5, strongest first.
df_corr = df.corr()['B5_A_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_A_Um:\n{golden_features_list}")
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1526]:
# Features whose absolute correlation with B5_N_Um exceeds 0.5, strongest first.
df_corr = df.corr()['B5_N_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_N_Um:\n{golden_features_list}")
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [1527]:
# Features whose absolute correlation with B5_O_Dm exceeds 0.5, strongest first.
df_corr = df.corr()['B5_O_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_O_Dm:\n{golden_features_list}")
There are 14 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.959060
Fake_news                    0.956468
N                            0.956468
E                            0.956468
O                            0.956468
Verified_status_False        0.955338
Volume                       0.855199
TRANGE                       0.756951
NATR                         0.569413
AvgTrueRange                 0.534221
Name: B5_O_Dm, dtype: float64
In [1528]:
# Features whose absolute correlation with B5_C_Dm exceeds 0.5, strongest first.
df_corr = df.corr()['B5_C_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_C_Dm:\n{golden_features_list}")
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [1529]:
# Features whose absolute correlation with B5_E_Dm exceeds 0.5, strongest first.
df_corr = df.corr()['B5_E_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_E_Dm:\n{golden_features_list}")
There are 14 strongly correlated values with B5_E_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.959060
Fake_news                    0.956468
N                            0.956468
E                            0.956468
O                            0.956468
Verified_status_False        0.955338
Volume                       0.855199
TRANGE                       0.756951
NATR                         0.569413
AvgTrueRange                 0.534221
Name: B5_E_Dm, dtype: float64
In [1530]:
# Features whose absolute correlation with B5_A_Dm exceeds 0.5, strongest first.
df_corr = df.corr()['B5_A_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_A_Dm:\n{golden_features_list}")
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1531]:
# Features whose absolute correlation with B5_N_Dm exceeds 0.5, strongest first.
df_corr = df.corr()['B5_N_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with B5_N_Dm:\n{golden_features_list}")
There are 14 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.959060
Fake_news                    0.956468
N                            0.956468
E                            0.956468
O                            0.956468
Verified_status_False        0.955338
Volume                       0.855199
TRANGE                       0.756951
NATR                         0.569413
AvgTrueRange                 0.534221
Name: B5_N_Dm, dtype: float64
In [1532]:
# Features whose absolute correlation with Fake_news exceeds 0.5, strongest first.
df_corr = df.corr()['Fake_news']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with Real_or_Fake_tweet :\n{golden_features_list}")
There are 16 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999980
B5_N_Dm                      0.956468
B5_E_Dm                      0.956468
B5_O_Dm                      0.956468
Downward_momentum_created    0.956468
Verified_status_True         0.939970
Volume                       0.884955
TRANGE                       0.834970
NATR                         0.559271
Variance                     0.551751
AvgTrueRange                 0.549347
ROCP                         0.500273
Name: Fake_news, dtype: float64
In [1533]:
# Features whose absolute correlation with Downward_momentum_created exceeds 0.5, strongest first.
df_corr = df.corr()['Downward_momentum_created']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with Downward_momentum_created :\n{golden_features_list}")
There are 14 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.959060
Fake_news                    0.956468
N                            0.956468
E                            0.956468
O                            0.956468
Verified_status_False        0.955338
Volume                       0.855199
TRANGE                       0.756951
NATR                         0.569413
AvgTrueRange                 0.534221
Name: Downward_momentum_created, dtype: float64
In [1534]:
# Features whose absolute correlation with Upward_momentum_created exceeds 0.5, strongest first.
df_corr = df.corr()['Upward_momentum_created']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with Upward_momentum_created :\n{golden_features_list}")
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1535]:
# Features whose absolute correlation with Verified_status_True exceeds 0.5, strongest first.
df_corr = df.corr()['Verified_status_True']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with Verified_status_True :\n{golden_features_list}")
There are 15 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_N_Dm                      0.959060
B5_E_Dm                      0.959060
B5_O_Dm                      0.959060
Downward_momentum_created    0.959060
Fake_news                    0.939970
N                            0.939970
E                            0.939970
O                            0.939970
Verified_status_False        0.937795
Volume                       0.805903
TRANGE                       0.797366
NATR                         0.540588
AvgTrueRange                 0.540514
Variance                     0.507919
Name: Verified_status_True, dtype: float64
In [1536]:
# Features whose absolute correlation with Verified_status_False exceeds 0.5, strongest first.
df_corr = df.corr()['Verified_status_False']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(golden_features_list)} strongly correlated values with Verified_status_False :\n{golden_features_list}")
There are 16 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999980
N                            0.999980
E                            0.999980
O                            0.999980
B5_N_Dm                      0.955338
B5_E_Dm                      0.955338
B5_O_Dm                      0.955338
Downward_momentum_created    0.955338
Verified_status_True         0.937795
Volume                       0.885417
TRANGE                       0.834722
NATR                         0.558984
Variance                     0.551938
AvgTrueRange                 0.548890
ROCP                         0.500822
Name: Verified_status_False, dtype: float64
In [1537]:
# Shrink seaborn fonts so the many pairplot panels below stay legible.
sns.set(font_scale=0.8)
In [1538]:
# Scatter every feature against NATR, five panels per figure.
step = 5
for start in range(0, len(df.columns), step):
    x_batch = df.columns[start:start + step]
    sns.pairplot(data=df, x_vars=x_batch, y_vars=['NATR'])
In [1539]:
# Re-check column dtypes after the feature-engineering steps above.
df.dtypes
Out[1539]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1540]:
# Count missing values per column.
df.isnull().sum()
Out[1540]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1541]:
# Replace any remaining NaNs with 0 (in place).
df.fillna(0, inplace = True)
In [1542]:
# NOTE(review): after fillna(0) in the previous cell, this dropna is a no-op —
# one of the two cells could be removed.
df.dropna(inplace=True)
In [1543]:
# Shrink seaborn fonts for the dense correlation heatmap below.
sns.set(font_scale=0.8)
In [1544]:
# Correlation heatmap limited to notable relationships: only cells with
# r >= 0.5 or r <= -0.4 are rendered (note the asymmetric thresholds);
# 'Close' is excluded from the matrix.
corr = df.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))

notable = corr[(corr >= 0.5) | (corr <= -0.4)]
sns.heatmap(notable,
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [1545]:
# Summary statistics for every numeric column.
df.describe()
Out[1545]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 207.000000 207.000000 207.000000 207.000000 207.000000 2.070000e+02 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000 2.070000e+02 2.070000e+02 2.070000e+02 207.0 2.070000e+02 207.0 207.0 207.0 207.0 207.0 2.070000e+02 207.0 2.070000e+02 207.0 2.070000e+02 207.000000 207.000000 207.000000 207.0 207.000000 207.0 207.000000 207.000000 207.000000 207.000000 207.000000 207.000000
mean 187.597488 197.537440 178.427343 187.103043 187.103043 8.462967e+06 0.575055 0.551097 258.195866 20.138345 210.682573 160.875080 185.778826 3.050232 10.890637 19.850435 36.405828 3.797127 3.953580 -0.156453 5.085169 2.309358 0.129043 50.748245 0.799605 45.758042 44.359167 44.381942 -2.475499e+08 -2.059015e+06 1.253085e+09 0.0 1.686618e+07 0.0 0.0 0.0 0.0 0.0 1.686618e+07 0.0 1.686618e+07 0.0 1.686618e+07 23.299517 2395.420290 2418.719807 0.0 2418.719807 0.0 2418.719807 2418.719807 0.575055 0.002555 0.063456 0.060267
std 32.779206 36.217505 30.649501 32.481793 32.481793 1.553779e+07 8.261050 0.473493 483.743900 11.548804 40.259944 35.428416 31.991064 22.857675 6.433320 18.997533 25.742918 12.460902 10.836341 5.253349 47.887780 15.972755 0.558327 10.627835 1.189472 6.853607 21.648726 19.947213 1.481241e+07 7.595282e+06 3.839607e+07 0.0 2.108665e+07 0.0 0.0 0.0 0.0 0.0 2.108665e+07 0.0 2.108665e+07 0.0 2.108665e+07 34.934851 1887.586691 1920.386738 0.0 1920.386738 0.0 1920.386738 1920.386738 8.261050 0.079272 0.030124 0.025629
min 104.540001 127.750000 86.000000 101.739998 101.739998 8.170000e+05 -33.788173 -1.035069 2.461732 8.207407 111.898036 7.710536 59.804286 -39.899167 4.447977 4.430008 1.342826 -14.650800 -11.504427 -10.324166 -144.660004 -41.563797 -0.545887 29.694287 -0.347303 29.841911 5.285988 6.560340 -2.737380e+08 -4.674117e+07 1.070773e+09 0.0 2.631992e+06 0.0 0.0 0.0 0.0 0.0 2.631992e+06 0.0 2.631992e+06 0.0 2.631992e+06 2.000000 960.000000 962.000000 0.0 962.000000 0.0 962.000000 962.000000 -33.788173 -0.412311 0.026954 0.026954
25% 163.720001 173.254997 158.934998 164.875000 164.875000 2.078850e+06 -2.878621 0.239682 33.784304 12.839828 186.694517 143.855679 167.255713 -11.963141 7.107317 9.040001 13.808580 -3.916909 -2.369103 -4.078278 -21.625000 -6.632109 -0.108796 42.300854 -0.006155 41.366079 25.856948 26.892865 -2.572258e+08 -2.213235e+06 1.248134e+09 0.0 7.110452e+06 0.0 0.0 0.0 0.0 0.0 7.110452e+06 0.0 7.110452e+06 0.0 7.110452e+06 7.000000 1468.500000 1480.000000 0.0 1480.000000 0.0 1480.000000 1480.000000 -2.878621 -0.029209 0.039926 0.039926
50% 183.000000 190.199997 176.149994 183.509995 183.509995 3.637500e+06 -0.169761 0.583654 68.545835 16.188048 203.000656 166.500148 181.825716 -1.258463 8.454730 13.910004 32.304798 0.930359 1.512840 -0.385839 -4.850006 -0.757182 -0.025438 49.722342 0.213994 44.569060 44.653556 44.299547 -2.482722e+08 -1.172837e+06 1.256641e+09 0.0 9.857086e+06 0.0 0.0 0.0 0.0 0.0 9.857086e+06 0.0 9.857086e+06 0.0 9.857086e+06 12.000000 1819.000000 1833.000000 0.0 1833.000000 0.0 1833.000000 1833.000000 -0.169761 -0.001699 0.055467 0.055141
75% 206.474998 215.775002 198.919998 205.794998 205.794998 7.928400e+06 2.491689 0.849307 239.841759 23.749896 225.371296 184.969773 207.012858 12.635994 11.874248 21.790001 53.720243 8.474013 7.360273 1.885374 19.845001 6.563387 0.119799 57.409790 0.936071 50.686769 62.570839 61.931554 -2.389040e+08 -1.476984e+05 1.273236e+09 0.0 1.793602e+07 0.0 0.0 0.0 0.0 0.0 1.793602e+07 0.0 1.793602e+07 0.0 1.793602e+07 25.500000 2678.000000 2700.500000 0.0 2700.500000 0.0 2700.500000 2700.500000 2.491689 0.024611 0.074099 0.069507
max 303.119995 348.500000 291.510010 302.559998 302.559998 1.503088e+08 52.692376 1.622262 3433.626003 59.525724 325.171336 232.070237 274.338571 74.173526 38.378684 176.500000 90.855730 46.195273 35.685519 19.532892 201.929993 59.598259 4.490327 76.273269 4.266893 66.032776 89.234937 82.492823 -1.877761e+08 1.522601e+07 1.343634e+09 0.0 2.247220e+08 0.0 0.0 0.0 0.0 0.0 2.247220e+08 0.0 2.247220e+08 0.0 2.247220e+08 300.000000 20149.000000 20449.000000 0.0 20449.000000 0.0 20449.000000 20449.000000 52.692376 0.423255 0.150254 0.148385
In [1546]:
# DROPPING ALL NaN VALUES
# NOTE(review): duplicates the earlier fillna/dropna cells; harmless but redundant.
df.dropna(inplace=True)
In [1547]:
# Plot realized current vs. future volatility; the lower panel zooms into
# the most recent n_zoom days.
# NOTE(review): `n_future` and `INTERVAL_WINDOW` are assumed to be defined in
# an earlier cell — confirm they exist before a fresh Restart & Run All.
n_zoom = 365
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    
    plt.show();

Daily Volatility Distribution

In [1548]:
# Distribution of current daily volatility with a fitted normal overlay.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, bins=50, norm_hist=True,
                 fit=stats.norm, ax=ax)
    ax.set_title('Daily Volatility Distribution')

    plt.show()

Experiment 2: weekly granularity

In [1553]:
# Ticker selector for the weekly-granularity experiment; echoes the choice.
STOCK_OPTIONS = ['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
                 'FB', 'GME', 'MCD', 'PFE', 'PLUG',
                 'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU']

w = widgets.Dropdown(
    options=STOCK_OPTIONS,
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    # observe() fires for several event types; react to value changes only.
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected GME
In [1554]:
# Load the per-ticker feature CSV matching the dropdown selection.
# Improvement: the original was a 15-way copy-pasted if-chain, one branch per
# ticker; the filename is simply '/content/Final_<TICKER>.csv', so a single
# parameterized read is equivalent. As before, nothing is loaded while the
# 'SELECT' placeholder is chosen.
TICKERS = ['AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI', 'FB', 'GME', 'MCD',
           'PFE', 'PLUG', 'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU']
if w.value in TICKERS:
    df = pd.read_csv('/content/Final_%s.csv' % w.value,
                     parse_dates=['Date'], index_col=['Date'])
In [1555]:
# Inspect the feature set of the loaded ticker dataset.
df.columns
Out[1555]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [1556]:
# Rows x columns of the daily dataset.
df.shape
Out[1556]:
(271, 52)
In [1557]:
# Per-column missing-value counts before imputation.
df.isnull().sum()
Out[1557]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           1
NATR                          0
TRANGE                        0
DMI                           0
MACD                          9
MACDSIGNAL                    9
MACDHIST                      9
MOM                           0
PPO                           1
ROCP                          0
RSI                           0
TRIX                         64
ULTOSC                        4
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [1558]:
# Impute remaining NaNs with per-column medians, drop the CSV index artifact
# column, and give the tweet flag a shorter name.
df = df.fillna(df.median())
df = df.drop(columns=['Unnamed: 0'])
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1559]:
# Aggregate the daily frame into calendar-week means.
df_weekly = df.resample('W').mean()
In [1560]:
# Dimensions of the weekly-resampled dataset.
df_weekly.shape
Out[1560]:
(57, 51)
In [1561]:
# Full weekly correlation matrix as an annotated heatmap; oversized canvas
# so all 51 columns stay legible.
plt.figure(figsize=(40,15))
sns.heatmap(df_weekly.corr(),annot=True)
Out[1561]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f07839b8950>
In [1562]:
# Shrink the seaborn font scale for the dense histogram grid below.
sns.set(font_scale=0.8)
In [1563]:
# Histogram of every weekly feature; trailing ';' suppresses the axes repr.
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [1564]:
# Features whose weekly correlation with AvgTrueRange exceeds |0.5|.
df_corr = df_weekly.corr()['AvgTrueRange']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 19 strongly correlated values with AvgTrueRange:
AvgTrueRange                 1.000000
Variance                     0.834722
MACDSIGNAL                   0.785677
NATR                         0.784625
MACD                         0.741433
TRANGE                       0.680592
PPO                          0.673722
APO                          0.627183
Upperband                    0.613702
Downward_momentum_created    0.608610
B5_O_Dm                      0.608610
B5_E_Dm                      0.608610
B5_N_Dm                      0.608610
Verified_status_False        0.574304
N                            0.574258
O                            0.574258
E                            0.574258
Fake_news                    0.574258
Verified_status_True         0.569250
Name: AvgTrueRange, dtype: float64
In [1565]:
# Features whose weekly correlation with NATR exceeds |0.5|.
df_corr = df_weekly.corr()['NATR']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.784625
Variance        0.685864
MACDSIGNAL      0.551584
PPO             0.518488
ADOSC          -0.570305
Name: NATR, dtype: float64
In [1566]:
# Features whose weekly correlation with TRANGE exceeds |0.5|.
df_corr = df_weekly.corr()['TRANGE']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 21 strongly correlated values with TRANGE:
TRANGE                       1.000000
Verified_status_False        0.938149
Fake_news                    0.937866
E                            0.937866
O                            0.937866
N                            0.937866
B5_N_Dm                      0.928371
B5_E_Dm                      0.928371
B5_O_Dm                      0.928371
Downward_momentum_created    0.928371
Verified_status_True         0.919571
ROCP                         0.888100
Variance                     0.754984
Volume                       0.695726
AvgTrueRange                 0.680592
MACD                         0.667107
MOM                          0.656165
Return                       0.619463
MACDHIST                     0.594328
PPO                          0.558267
High                         0.500715
Name: TRANGE, dtype: float64
In [1567]:
# Features whose weekly correlation with Openness (Big-Five 'O') exceeds |0.5|.
df_corr = df_weekly.corr()['O']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999998
Verified_status_True         0.995290
Downward_momentum_created    0.992827
B5_O_Dm                      0.992827
B5_E_Dm                      0.992827
B5_N_Dm                      0.992827
TRANGE                       0.937866
ROCP                         0.910877
Variance                     0.759553
Volume                       0.712390
Return                       0.666431
AvgTrueRange                 0.574258
MOM                          0.544475
Name: O, dtype: float64
In [1568]:
# Features whose weekly correlation with Conscientiousness ('C') exceeds |0.5|.
df_corr = df_weekly.corr()['C']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [1569]:
# Features whose weekly correlation with Extraversion ('E') exceeds |0.5|.
# Fix: the printed label said "conscientiousness" — a copy-paste slip from
# the 'C' cell; this cell screens the 'E' column.
df_corr = df_weekly.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999998
Verified_status_True         0.995290
Downward_momentum_created    0.992827
B5_O_Dm                      0.992827
B5_E_Dm                      0.992827
B5_N_Dm                      0.992827
TRANGE                       0.937866
ROCP                         0.910877
Variance                     0.759553
Volume                       0.712390
Return                       0.666431
AvgTrueRange                 0.574258
MOM                          0.544475
Name: E, dtype: float64
In [1570]:
# Features whose weekly correlation with Agreeableness ('A') exceeds |0.5|.
# Fix: the printed label said "conscientiousness" — a copy-paste slip from
# the 'C' cell; this cell screens the 'A' column.
df_corr = df_weekly.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1571]:
# Features whose weekly correlation with Neuroticism ('N') exceeds |0.5|.
# Fix: the printed label said "conscientiousness" — a copy-paste slip from
# the 'C' cell; this cell screens the 'N' column.
df_corr = df_weekly.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999998
Verified_status_True         0.995290
Downward_momentum_created    0.992827
B5_O_Dm                      0.992827
B5_E_Dm                      0.992827
B5_N_Dm                      0.992827
TRANGE                       0.937866
ROCP                         0.910877
Variance                     0.759553
Volume                       0.712390
Return                       0.666431
AvgTrueRange                 0.574258
MOM                          0.544475
Name: N, dtype: float64
In [1572]:
# Features whose weekly correlation with B5_O_Um exceeds |0.5|.
df_corr = df_weekly.corr()['B5_O_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1573]:
# Features whose weekly correlation with B5_C_Um exceeds |0.5|.
df_corr = df_weekly.corr()['B5_C_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1574]:
# Features whose weekly correlation with B5_E_Um exceeds |0.5|.
df_corr = df_weekly.corr()['B5_E_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1575]:
# Features whose weekly correlation with B5_A_Um exceeds |0.5|.
df_corr = df_weekly.corr()['B5_A_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1576]:
# Features whose weekly correlation with B5_N_Um exceeds |0.5|.
df_corr = df_weekly.corr()['B5_N_Um']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [1577]:
# Features whose weekly correlation with B5_O_Dm exceeds |0.5|.
df_corr = df_weekly.corr()['B5_O_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with B5_O_Dm:
B5_O_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_E_Dm                      1.000000
Verified_status_True         0.997526
N                            0.992827
Fake_news                    0.992827
O                            0.992827
E                            0.992827
Verified_status_False        0.992632
TRANGE                       0.928371
ROCP                         0.891693
Variance                     0.784131
Volume                       0.730821
Return                       0.632910
AvgTrueRange                 0.608610
MOM                          0.520204
Name: B5_O_Dm, dtype: float64
In [1578]:
# Features whose weekly correlation with B5_C_Dm exceeds |0.5|.
df_corr = df_weekly.corr()['B5_C_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [1579]:
# Features whose weekly correlation with B5_E_Dm exceeds |0.5|.
df_corr = df_weekly.corr()['B5_E_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with B5_E_Dm:
B5_O_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_E_Dm                      1.000000
Verified_status_True         0.997526
N                            0.992827
Fake_news                    0.992827
O                            0.992827
E                            0.992827
Verified_status_False        0.992632
TRANGE                       0.928371
ROCP                         0.891693
Variance                     0.784131
Volume                       0.730821
Return                       0.632910
AvgTrueRange                 0.608610
MOM                          0.520204
Name: B5_E_Dm, dtype: float64
In [1580]:
# Features whose weekly correlation with B5_A_Dm exceeds |0.5|.
df_corr = df_weekly.corr()['B5_A_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1581]:
# Features whose weekly correlation with B5_N_Dm exceeds |0.5|.
df_corr = df_weekly.corr()['B5_N_Dm']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with B5_N_Dm:
B5_O_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_E_Dm                      1.000000
Verified_status_True         0.997526
N                            0.992827
Fake_news                    0.992827
O                            0.992827
E                            0.992827
Verified_status_False        0.992632
TRANGE                       0.928371
ROCP                         0.891693
Variance                     0.784131
Volume                       0.730821
Return                       0.632910
AvgTrueRange                 0.608610
MOM                          0.520204
Name: B5_N_Dm, dtype: float64
In [1582]:
# Features whose weekly correlation with the fake-news flag exceeds |0.5|.
# Fix: label updated from the pre-rename column name 'Real_or_Fake_tweet'
# to the current name 'Fake_news' (the column was renamed in the cleaning
# cell above).
df_corr = df_weekly.corr()['Fake_news']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Fake_news :\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999998
Verified_status_True         0.995290
Downward_momentum_created    0.992827
B5_O_Dm                      0.992827
B5_E_Dm                      0.992827
B5_N_Dm                      0.992827
TRANGE                       0.937866
ROCP                         0.910877
Variance                     0.759553
Volume                       0.712390
Return                       0.666431
AvgTrueRange                 0.574258
MOM                          0.544475
Name: Fake_news, dtype: float64
In [1583]:
# Features whose weekly correlation with Downward_momentum_created exceeds |0.5|.
df_corr = df_weekly.corr()['Downward_momentum_created']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with Downward_momentum_created :
B5_O_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_E_Dm                      1.000000
Verified_status_True         0.997526
N                            0.992827
Fake_news                    0.992827
O                            0.992827
E                            0.992827
Verified_status_False        0.992632
TRANGE                       0.928371
ROCP                         0.891693
Variance                     0.784131
Volume                       0.730821
Return                       0.632910
AvgTrueRange                 0.608610
MOM                          0.520204
Name: Downward_momentum_created, dtype: float64
In [1584]:
# Features whose weekly correlation with Upward_momentum_created exceeds |0.5|.
df_corr = df_weekly.corr()['Upward_momentum_created']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1585]:
# Features whose weekly correlation with Verified_status_True exceeds |0.5|.
df_corr = df_weekly.corr()['Verified_status_True']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_O_Dm                      0.997526
B5_N_Dm                      0.997526
Downward_momentum_created    0.997526
B5_E_Dm                      0.997526
N                            0.995290
Fake_news                    0.995290
O                            0.995290
E                            0.995290
Verified_status_False        0.995095
TRANGE                       0.919571
ROCP                         0.900564
Variance                     0.764988
Volume                       0.717447
Return                       0.648237
AvgTrueRange                 0.569250
MOM                          0.517950
Name: Verified_status_True, dtype: float64
In [1586]:
# Features whose weekly correlation with Verified_status_False exceeds |0.5|.
df_corr = df_weekly.corr()['Verified_status_False']
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999998
N                            0.999998
E                            0.999998
O                            0.999998
Verified_status_True         0.995095
Downward_momentum_created    0.992632
B5_O_Dm                      0.992632
B5_E_Dm                      0.992632
B5_N_Dm                      0.992632
TRANGE                       0.938149
ROCP                         0.910999
Variance                     0.759367
Volume                       0.712216
Return                       0.666739
AvgTrueRange                 0.574304
MOM                          0.544966
Name: Verified_status_False, dtype: float64
In [1587]:
# Re-apply the smaller seaborn font scale before the pairplot grid
# (same setting as the earlier sns.set call).
sns.set(font_scale=0.8)
In [1588]:
# One pairplot per chunk of five feature columns, each plotted against NATR.
for start in range(0, len(df_weekly.columns), 5):
    column_chunk = df_weekly.columns[start:start + 5]
    sns.pairplot(data=df_weekly, x_vars=column_chunk, y_vars=['NATR'])
In [1589]:
# Replace any remaining NaNs in the weekly frame with 0.
df_weekly = df_weekly.fillna(0)
In [1590]:
# NOTE(review): likely redundant — the previous cell filled every NaN with 0,
# so there should be nothing left to drop here; confirm and consider removing.
df_weekly.dropna(inplace=True)
In [1591]:
# Correlation heatmap with 'Close' excluded, masked so only strong
# relationships are annotated.
# NOTE(review): thresholds are asymmetric (>= 0.5 vs <= -0.4) — confirm the
# -0.4 bound is intentional and not a typo for -0.5.
corr = df_weekly.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly volatility distribution

In [1592]:
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    # Weekly NATR distribution with a fitted normal overlay.
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; migrate to
    # histplot/displot if the environment is upgraded (fit= has no direct
    # equivalent there).
    sns.distplot(df_weekly.NATR, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Weekly Volatility Distribution')
    
    plt.show();